Steps and workflow:
0 - Using this notebook
1 - Download data
2 - Explore data
3 - Choose and train a model
4 - Scoring
5 - Make prediction
import os
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
# Time series decomposition
#!pip install stldecompose
#from stldecompose import decompose
from statsmodels.tsa.seasonal import STL
# Chart drawing
import plotly as py
import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Mute sklearn warnings
from warnings import simplefilter
# Silence sklearn/pandas deprecation chatter in the notebook output.
simplefilter(action='ignore', category=FutureWarning)
simplefilter(action='ignore', category=DeprecationWarning)
# Show charts when running kernel (plotly offline/inline rendering).
init_notebook_mode(connected=True)
# Register and activate a default plotly template: transparent paper,
# light-grey plot area, applied to every figure created below.
layout=go.Layout(paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(250,250,250,0.8)')
fig = go.Figure(layout=layout)
templated_fig = pio.to_templated(fig)
pio.templates['my_template'] = templated_fig.layout.template
pio.templates.default = 'my_template'
1 - Download data
We will provide you with two datasets.
Training_data will be used to train your model. Hackathon_data will be used to make your prediction. There are three targets you need to provide predictions on: target_r, target_g, target_b.
# Data download (may take a few minutes depending on your network).
train_datalink_X = 'https://tournament.crunchdao.com/data/X_train.csv'
train_datalink_y = 'https://tournament.crunchdao.com/data/y_train.csv'
hackathon_data_link = 'https://tournament.crunchdao.com/data/X_test.csv'
# Feature matrix used for training.
train_data = pd.read_csv(train_datalink_X)
# Data for which you will submit your prediction.
test_data = pd.read_csv(hackathon_data_link)
# Targets used for supervised training (target_r, target_g, target_b).
train_targets = pd.read_csv(train_datalink_y)
# Drop the per-row identifier; 'Moons' is kept, so the time ordering
# remains available for the time-series features built later.
train_data = train_data.drop(columns=['id'])
#test_data = test_data.drop(columns=['Moons', 'id'])
display(train_data)
display(train_targets)
display(test_data)
| Moons | Feature_1 | Feature_2 | Feature_3 | Feature_4 | Feature_5 | Feature_6 | Feature_7 | Feature_8 | Feature_9 | ... | Feature_19 | Feature_20 | Feature_21 | Feature_22 | Feature_23 | Feature_24 | Feature_25 | Feature_26 | Feature_27 | Feature_28 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.00 | 0.25 | 0.25 | 0.00 | 0.25 | 0.25 | 0.25 | 0.25 | 0.25 | ... | 0.50 | 0.50 | 0.25 | 1.00 | 0.25 | 0.00 | 0.25 | 0.50 | 0.25 | 0.75 |
| 1 | 0 | 1.00 | 0.50 | 0.75 | 0.75 | 0.75 | 0.75 | 0.75 | 0.75 | 1.00 | ... | 0.50 | 0.50 | 0.25 | 0.50 | 0.25 | 0.00 | 0.25 | 1.00 | 0.25 | 0.50 |
| 2 | 0 | 0.25 | 1.00 | 0.00 | 0.00 | 0.75 | 0.75 | 0.25 | 0.25 | 0.75 | ... | 0.50 | 0.75 | 0.00 | 0.25 | 0.25 | 0.75 | 1.00 | 0.25 | 1.00 | 0.75 |
| 3 | 0 | 0.25 | 0.00 | 0.50 | 0.50 | 1.00 | 1.00 | 0.50 | 0.50 | 0.25 | ... | 0.50 | 0.00 | 0.25 | 0.50 | 0.25 | 0.50 | 1.00 | 1.00 | 0.25 | 0.75 |
| 4 | 0 | 1.00 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | ... | 0.50 | 1.00 | 0.25 | 0.75 | 0.25 | 0.75 | 0.25 | 0.50 | 0.25 | 0.50 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 67892 | 28 | 1.00 | 1.00 | 0.00 | 0.50 | 0.75 | 0.75 | 0.50 | 0.50 | 0.75 | ... | 0.25 | 0.00 | 0.25 | 1.00 | 1.00 | 1.00 | 0.75 | 0.50 | 0.25 | 0.50 |
| 67893 | 28 | 1.00 | 0.75 | 0.25 | 0.25 | 0.75 | 0.75 | 0.25 | 0.25 | 0.25 | ... | 0.00 | 0.00 | 1.00 | 1.00 | 0.00 | 0.50 | 0.00 | 0.25 | 0.00 | 0.50 |
| 67894 | 28 | 1.00 | 1.00 | 1.00 | 0.25 | 0.00 | 0.00 | 0.75 | 0.75 | 1.00 | ... | 0.25 | 1.00 | 1.00 | 0.25 | 0.00 | 0.50 | 0.25 | 0.25 | 1.00 | 0.25 |
| 67895 | 28 | 1.00 | 1.00 | 0.50 | 0.50 | 0.25 | 0.25 | 0.00 | 0.00 | 0.00 | ... | 1.00 | 1.00 | 0.25 | 1.00 | 1.00 | 1.00 | 0.00 | 0.50 | 0.00 | 0.25 |
| 67896 | 28 | 0.50 | 0.50 | 0.25 | 0.25 | 0.75 | 0.75 | 0.75 | 0.75 | 0.75 | ... | 0.25 | 1.00 | 1.00 | 0.00 | 1.00 | 0.50 | 1.00 | 1.00 | 0.00 | 0.00 |
67897 rows × 29 columns
| target_r | target_g | target_b | |
|---|---|---|---|
| 0 | 0.25 | 0.50 | 0.25 |
| 1 | 0.25 | 0.25 | 0.75 |
| 2 | 0.75 | 0.50 | 0.75 |
| 3 | 0.25 | 0.25 | 0.25 |
| 4 | 0.75 | 1.00 | 1.00 |
| ... | ... | ... | ... |
| 67892 | 0.50 | 0.50 | 0.50 |
| 67893 | 0.25 | 0.00 | 0.25 |
| 67894 | 0.50 | 0.25 | 0.25 |
| 67895 | 0.00 | 0.50 | 0.25 |
| 67896 | 0.00 | 0.00 | 0.25 |
67897 rows × 3 columns
| id | Moons | Feature_1 | Feature_2 | Feature_3 | Feature_4 | Feature_5 | Feature_6 | Feature_7 | Feature_8 | ... | Feature_19 | Feature_20 | Feature_21 | Feature_22 | Feature_23 | Feature_24 | Feature_25 | Feature_26 | Feature_27 | Feature_28 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0021949c0f40bf7b8de636d5557a36bd | 0 | 1.00 | 0.75 | 0.75 | 0.75 | 0.50 | 0.0 | 0.75 | 0.0 | ... | 1.00 | 1.00 | 0.00 | 0.75 | 0.25 | 0.75 | 0.25 | 0.00 | 1.00 | 0.75 |
| 1 | 0044a1eb91e6d9e766bfc070a58c091e | 0 | 0.00 | 0.75 | 1.00 | 1.00 | 0.50 | 0.0 | 0.50 | 0.0 | ... | 0.50 | 1.00 | 0.25 | 0.75 | 0.25 | 0.75 | 1.00 | 0.25 | 0.00 | 0.25 |
| 2 | 004b477e43f22d3eb0b7e318badb2438 | 0 | 0.00 | 0.00 | 0.75 | 0.75 | 0.25 | 0.0 | 0.25 | 0.0 | ... | 0.50 | 0.50 | 0.25 | 0.00 | 0.25 | 0.00 | 0.25 | 1.00 | 0.25 | 1.00 |
| 3 | 005e227ee57221853cc374b5d5d79256 | 0 | 0.00 | 0.50 | 0.75 | 0.75 | 0.50 | 0.0 | 0.75 | 0.0 | ... | 0.00 | 0.00 | 0.25 | 0.75 | 0.00 | 0.75 | 1.00 | 1.00 | 1.00 | 1.00 |
| 4 | 006b9832e7683a7ec2b6c3d177c4715a | 0 | 0.00 | 1.00 | 0.75 | 0.75 | 0.25 | 0.0 | 0.25 | 0.0 | ... | 0.25 | 1.00 | 0.25 | 0.75 | 0.00 | 0.75 | 0.25 | 0.25 | 0.75 | 0.25 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2223 | ff9baadf910a626a35599a1c15ddb42a | 0 | 0.75 | 1.00 | 0.50 | 0.50 | 0.25 | 1.0 | 1.00 | 1.0 | ... | 0.00 | 0.50 | 0.00 | 0.50 | 0.00 | 0.25 | 1.00 | 0.50 | 0.75 | 0.75 |
| 2224 | ffafd9c92b60f98962902f7cae7e3b66 | 0 | 0.50 | 0.25 | 0.00 | 0.00 | 0.75 | 1.0 | 0.75 | 1.0 | ... | 1.00 | 0.25 | 1.00 | 0.25 | 1.00 | 0.00 | 0.00 | 1.00 | 0.75 | 1.00 |
| 2225 | ffc01cf716931e050512a9a106150a40 | 0 | 1.00 | 0.50 | 0.00 | 0.00 | 0.75 | 1.0 | 0.00 | 1.0 | ... | 0.00 | 0.75 | 1.00 | 0.00 | 0.00 | 0.00 | 0.75 | 0.25 | 0.75 | 0.25 |
| 2226 | ffe524c3576df2a4b9cb3acd13fbb355 | 0 | 1.00 | 0.25 | 0.25 | 0.25 | 0.50 | 1.0 | 0.50 | 1.0 | ... | 1.00 | 0.50 | 1.00 | 0.25 | 1.00 | 0.00 | 1.00 | 0.75 | 0.75 | 0.25 |
| 2227 | ffe8a6d4e8f7efc676f60b8ae9d8481d | 0 | 0.50 | 0.00 | 0.75 | 0.25 | 0.25 | 1.0 | 0.25 | 1.0 | ... | 1.00 | 0.50 | 1.00 | 1.00 | 1.00 | 1.00 | 0.75 | 0.50 | 0.75 | 0.00 |
2228 rows × 30 columns
# Summary statistics — per the output below, every feature and target sits on
# the discrete grid {0, 0.25, 0.5, 0.75, 1} with mean ~0.5 and std ~0.354.
display(train_data.describe())
display(train_targets.describe())
| Moons | Feature_1 | Feature_2 | Feature_3 | Feature_4 | Feature_5 | Feature_6 | Feature_7 | Feature_8 | Feature_9 | ... | Feature_19 | Feature_20 | Feature_21 | Feature_22 | Feature_23 | Feature_24 | Feature_25 | Feature_26 | Feature_27 | Feature_28 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 67897.00000 | 67897.000000 | 67897.000000 | 67897.000000 | 67897.000000 | 67897.000000 | 67897.000000 | 67897.000000 | 67897.000000 | 67897.000000 | ... | 67897.000000 | 67897.000000 | 67897.000000 | 67897.000000 | 67897.000000 | 67897.000000 | 67897.000000 | 67897.000000 | 67897.000000 | 67897.000000 |
| mean | 13.79625 | 0.499985 | 0.499985 | 0.499985 | 0.499985 | 0.499985 | 0.499985 | 0.499985 | 0.499985 | 0.499985 | ... | 0.499985 | 0.499985 | 0.499985 | 0.499985 | 0.499985 | 0.499985 | 0.499985 | 0.499985 | 0.499985 | 0.499985 |
| std | 8.46247 | 0.353621 | 0.353621 | 0.353621 | 0.353621 | 0.353621 | 0.353621 | 0.353621 | 0.353621 | 0.353621 | ... | 0.353621 | 0.353621 | 0.353621 | 0.353621 | 0.353621 | 0.353621 | 0.353621 | 0.353621 | 0.353621 | 0.353621 |
| min | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 6.00000 | 0.250000 | 0.250000 | 0.250000 | 0.250000 | 0.250000 | 0.250000 | 0.250000 | 0.250000 | 0.250000 | ... | 0.250000 | 0.250000 | 0.250000 | 0.250000 | 0.250000 | 0.250000 | 0.250000 | 0.250000 | 0.250000 | 0.250000 |
| 50% | 14.00000 | 0.500000 | 0.500000 | 0.500000 | 0.500000 | 0.500000 | 0.500000 | 0.500000 | 0.500000 | 0.500000 | ... | 0.500000 | 0.500000 | 0.500000 | 0.500000 | 0.500000 | 0.500000 | 0.500000 | 0.500000 | 0.500000 | 0.500000 |
| 75% | 21.00000 | 0.750000 | 0.750000 | 0.750000 | 0.750000 | 0.750000 | 0.750000 | 0.750000 | 0.750000 | 0.750000 | ... | 0.750000 | 0.750000 | 0.750000 | 0.750000 | 0.750000 | 0.750000 | 0.750000 | 0.750000 | 0.750000 | 0.750000 |
| max | 28.00000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 29 columns
| target_r | target_g | target_b | |
|---|---|---|---|
| count | 67897.000000 | 67897.000000 | 67897.000000 |
| mean | 0.499985 | 0.499985 | 0.499985 |
| std | 0.353621 | 0.353621 | 0.353621 |
| min | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.250000 | 0.250000 | 0.250000 |
| 50% | 0.500000 | 0.500000 | 0.500000 |
| 75% | 0.750000 | 0.750000 | 0.750000 |
| max | 1.000000 | 1.000000 | 1.000000 |
# Dtypes and null counts — the output shows all 29 columns are fully non-null.
display(train_data.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 67897 entries, 0 to 67896 Data columns (total 29 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Moons 67897 non-null int64 1 Feature_1 67897 non-null float64 2 Feature_2 67897 non-null float64 3 Feature_3 67897 non-null float64 4 Feature_4 67897 non-null float64 5 Feature_5 67897 non-null float64 6 Feature_6 67897 non-null float64 7 Feature_7 67897 non-null float64 8 Feature_8 67897 non-null float64 9 Feature_9 67897 non-null float64 10 Feature_10 67897 non-null float64 11 Feature_11 67897 non-null float64 12 Feature_12 67897 non-null float64 13 Feature_13 67897 non-null float64 14 Feature_14 67897 non-null float64 15 Feature_15 67897 non-null float64 16 Feature_16 67897 non-null float64 17 Feature_17 67897 non-null float64 18 Feature_18 67897 non-null float64 19 Feature_19 67897 non-null float64 20 Feature_20 67897 non-null float64 21 Feature_21 67897 non-null float64 22 Feature_22 67897 non-null float64 23 Feature_23 67897 non-null float64 24 Feature_24 67897 non-null float64 25 Feature_25 67897 non-null float64 26 Feature_26 67897 non-null float64 27 Feature_27 67897 non-null float64 28 Feature_28 67897 non-null float64 dtypes: float64(28), int64(1) memory usage: 15.0 MB
None
Moving Averages
# Moving-average features of Feature_2, each .shift()ed by one row so a
# row's feature only uses information from strictly earlier rows.
# NOTE(review): only EMA_9 is a true exponential moving average (ewm);
# EMA_5/10/15/30 are simple rolling means despite their names. The column
# names are referenced downstream, so they are left unchanged here.
train_data['EMA_9'] = train_data['Feature_2'].ewm(9).mean().shift()
train_data['EMA_5'] = train_data['Feature_2'].rolling(5).mean().shift()
train_data['EMA_10'] = train_data['Feature_2'].rolling(10).mean().shift()
train_data['EMA_15'] = train_data['Feature_2'].rolling(15).mean().shift()
train_data['EMA_30'] = train_data['Feature_2'].rolling(30).mean().shift()
fig = go.Figure()
fig.add_trace(go.Scatter(x=train_data.Moons, y=train_data.EMA_9, name='EMA 9'))
fig.add_trace(go.Scatter(x=train_data.Moons, y=train_data.EMA_5, name='EMA 5'))
fig.add_trace(go.Scatter(x=train_data.Moons, y=train_data.EMA_10, name='EMA 10'))
fig.add_trace(go.Scatter(x=train_data.Moons, y=train_data.EMA_15, name='EMA 15'))
fig.add_trace(go.Scatter(x=train_data.Moons, y=train_data.EMA_30, name='EMA 30'))
fig.show()
RSI
def relative_strength_idx(df, n=14, column='Feature_2'):
    """Compute the Relative Strength Index (RSI) of ``df[column]``.

    Args:
        df: DataFrame holding the series to analyse.
        n: Look-back window length (default 14, the conventional RSI period).
        column: Name of the column to compute the RSI on (default 'Feature_2').

    Returns:
        pd.Series of RSI values in [0, 100]; entries are NaN until the
        rolling windows are filled.
    """
    # Bug fix: the original ignored `df` and read the global `train_data`,
    # which made the function silently wrong for any other DataFrame.
    close = df[column]
    delta = close.diff()
    # Drop the leading NaN produced by diff() so the windows line up.
    delta = delta[1:]
    gains = delta.copy()
    losses = delta.copy()
    gains[gains < 0] = 0
    losses[losses > 0] = 0
    avg_gain = gains.rolling(n).mean()
    avg_loss = losses.abs().rolling(n).mean()
    rs = avg_gain / avg_loss
    rsi = 100.0 - (100.0 / (1.0 + rs))
    return rsi
# Attach the RSI feature; warm-up NaNs are replaced with 0.
# NOTE(review): fillna(0) reads as "maximally oversold" for the first rows —
# confirm this imputation is intended rather than e.g. a neutral 50.
train_data['RSI'] = relative_strength_idx(train_data).fillna(0)
fig = go.Figure(go.Scatter(x=train_data.Moons, y=train_data.RSI, name='RSI'))
fig.show()
MACD
# MACD of Feature_2: difference between the 12- and 26-span EMAs, with a
# 9-span EMA of the MACD as the signal line. min_periods leaves the warm-up
# region as NaN instead of an under-averaged estimate.
EMA_12 = pd.Series(train_data['Feature_2'].ewm(span=12, min_periods=12).mean())
EMA_26 = pd.Series(train_data['Feature_2'].ewm(span=26, min_periods=26).mean())
train_data['MACD'] = pd.Series(EMA_12 - EMA_26)
train_data['MACD_signal'] = pd.Series(train_data.MACD.ewm(span=9, min_periods=9).mean())
fig = make_subplots(rows=2, cols=1)
# Bug fix: this trace is labelled 'Feature_2' and is compared against EMAs of
# Feature_2, but the original plotted train_data.Feature_1.
fig.add_trace(go.Scatter(x=train_data.Moons, y=train_data.Feature_2, name='Feature_2'), row=1, col=1)
fig.add_trace(go.Scatter(x=train_data.Moons, y=EMA_12, name='EMA 12'), row=1, col=1)
fig.add_trace(go.Scatter(x=train_data.Moons, y=EMA_26, name='EMA 26'), row=1, col=1)
fig.add_trace(go.Scatter(x=train_data.Moons, y=train_data['MACD'], name='MACD'), row=2, col=1)
fig.add_trace(go.Scatter(x=train_data.Moons, y=train_data['MACD_signal'], name='Signal line'), row=2, col=1)
fig.show()
Split the data frame into three chronological subsets: training (70%), validation (15%), test (15%)
# Chronological split (no shuffling): first 70% train, next 15% validation,
# last 15% test — appropriate since the features are time-lagged.
test_size = 0.15
valid_size = 0.15
test_split_idx = int(train_data.shape[0] * (1-test_size))
valid_split_idx = int(train_data.shape[0] * (1-(valid_size+test_size)))
# .loc label slicing is inclusive of BOTH endpoints, hence the +1 offsets.
# Assumes the default RangeIndex from read_csv — TODO confirm if the index
# is ever changed upstream.
train_df = train_data.loc[:valid_split_idx].copy()
valid_df = train_data.loc[valid_split_idx+1:test_split_idx].copy()
test_df = train_data.loc[test_split_idx+1:].copy()
fig = go.Figure()
fig.add_trace(go.Scatter(x=train_df.Moons, y=train_df.Feature_2, name='Training'))
fig.add_trace(go.Scatter(x=valid_df.Moons, y=valid_df.Feature_2, name='Validation'))
fig.add_trace(go.Scatter(x=test_df.Moons, y=test_df.Feature_2, name='Test'))
fig.show()
Drop the columns that are not being used
# Keep only the engineered indicators (EMA_*, RSI, MACD*) plus Feature_2;
# every raw feature and the Moons column is dropped from all three splits.
drop_cols = ['Moons', 'Feature_1', 'Feature_3', 'Feature_4', 'Feature_5', 'Feature_6', 'Feature_7', 'Feature_8', 'Feature_9', 'Feature_10', 'Feature_11', 'Feature_12', 'Feature_13', 'Feature_14', 'Feature_15', 'Feature_16', 'Feature_17', 'Feature_18', 'Feature_19', 'Feature_20', 'Feature_21', 'Feature_22', 'Feature_23', 'Feature_24', 'Feature_25', 'Feature_26', 'Feature_27', 'Feature_28']
# Fix: df.drop(cols, 1) used the positional `axis` argument, deprecated in
# pandas 1.0 and removed in pandas 2.0; the keyword form is future-proof.
train_df = train_df.drop(columns=drop_cols)
valid_df = valid_df.drop(columns=drop_cols)
test_df = test_df.drop(columns=drop_cols)
Split into features and labels
# Feature_2 is the regression target; everything else that survived the drop
# (lagged EMAs, RSI, MACD) forms the model input for each split.
y_train = train_df['Feature_2'].copy()
# Fix: drop([...], 1) used the positional `axis` argument, deprecated in
# pandas 1.0 and removed in pandas 2.0; use the columns= keyword instead.
X_train = train_df.drop(columns=['Feature_2'])
y_valid = valid_df['Feature_2'].copy()
X_valid = valid_df.drop(columns=['Feature_2'])
y_test = test_df['Feature_2'].copy()
X_test = test_df.drop(columns=['Feature_2'])
X_train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 47528 entries, 0 to 47527 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 EMA_9 47527 non-null float64 1 EMA_5 47523 non-null float64 2 EMA_10 47518 non-null float64 3 EMA_15 47513 non-null float64 4 EMA_30 47498 non-null float64 5 RSI 47527 non-null float64 6 MACD 47503 non-null float64 7 MACD_signal 47495 non-null float64 dtypes: float64(8) memory usage: 2.9 MB
%%time
parameters = {
'n_estimators': [400],
'learning_rate': [0.05],
'max_depth': [15],
'gamma': [0.02],
'random_state': [42]
}
eval_set = [(X_train, y_train), (X_valid, y_valid)]
model = xgb.XGBRegressor(eval_set=eval_set, objective='reg:squarederror')
clf = GridSearchCV(model, parameters)
clf.fit(X_train, y_train)
print(f'Best params: {clf.best_params_}')
print(f'Best validation score = {clf.best_score_}')
[15:38:44] WARNING: ..\src\learner.cc:541:
Parameters: { eval_set } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[15:39:25] WARNING: ..\src\learner.cc:541:
Parameters: { eval_set } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[15:40:27] WARNING: ..\src\learner.cc:541:
Parameters: { eval_set } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[15:41:28] WARNING: ..\src\learner.cc:541:
Parameters: { eval_set } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[15:42:31] WARNING: ..\src\learner.cc:541:
Parameters: { eval_set } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[15:43:36] WARNING: ..\src\learner.cc:541:
Parameters: { eval_set } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
Best params: {'gamma': 0.02, 'learning_rate': 0.05, 'max_depth': 15, 'n_estimators': 400, 'random_state': 42}
Best validation score = 0.8524452403640194
Wall time: 6min 12s
%%time
# Refit on the full training split with the best grid-search parameters;
# eval_set lets XGBoost track train/validation error per boosting round
# (verbose=False suppresses the per-round log).
model = xgb.XGBRegressor(**clf.best_params_, objective='reg:squarederror')
model.fit(X_train, y_train, eval_set=eval_set, verbose=False)
Wall time: 1min 16s
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0.02, gpu_id=-1,
importance_type='gain', interaction_constraints='',
learning_rate=0.05, max_delta_step=0, max_depth=15,
min_child_weight=1, missing=nan, monotone_constraints='()',
n_estimators=400, n_jobs=12, num_parallel_tree=1, random_state=42,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=None)
# Per-feature importance of the fitted booster (trailing ';' mutes the repr).
plot_importance(model);
Calculate and predict prices
# Predict Feature_2 on the held-out test split and spot-check a few values.
y_pred = model.predict(X_test)
print(f'y_true = {np.array(y_test)[:5]}')
print(f'y_pred = {y_pred[:5]}')
y_true = [0.25 0.5 0. 0. 0. ] y_pred = [ 0.13695318 0.3457684 0.06881184 0.0295731 -0.01050354]
# MSE on the held-out test split.
print(f'mean_squared_error = {mean_squared_error(y_test, y_pred)}')
mean_squared_error = 0.018134200519696882
# Overlay truth vs prediction for the test region.
# Bug fix: the model predicts Feature_2 (y_test was taken from Feature_2),
# but the original stored y_pred into 'Feature_1' and plotted Feature_1 as
# 'Truth', comparing two unrelated series.
predicted_prices = train_data.loc[test_split_idx+1:].copy()
predicted_prices['Feature_2'] = y_pred
fig = make_subplots(rows=2, cols=1)
# Row 1: full Feature_2 history with the prediction segment overlaid.
fig.add_trace(go.Scatter(x=train_data.Moons, y=train_data.Feature_2,
                         name='Truth',
                         marker_color='LightSkyBlue'), row=1, col=1)
fig.add_trace(go.Scatter(x=predicted_prices.Moons,
                         y=predicted_prices.Feature_2,
                         name='Prediction',
                         marker_color='MediumPurple'), row=1, col=1)
# Row 2: zoom on the test region only.
fig.add_trace(go.Scatter(x=predicted_prices.Moons,
                         y=y_test,
                         name='Truth',
                         marker_color='LightSkyBlue',
                         showlegend=False), row=2, col=1)
fig.add_trace(go.Scatter(x=predicted_prices.Moons,
                         y=y_pred,
                         name='Prediction',
                         marker_color='MediumPurple',
                         showlegend=False), row=2, col=1)
fig.show()